10  Data imputation

Caution: Still under construction

This section is still under construction and will be completed in the near future. Please do not go beyond this point for now.

Real datasets often contain missing values or gaps due to various reasons such as sensor malfunctions, data corruption, or transmission errors. Handling these missing values is crucial for accurate analysis and modeling.

In this section we will explore common techniques for imputing missing data in time series.

11 Examples for time series imputation

The collapsed code cells include the imports, helper functions, and data loading.

Imports

Code
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio

from plotly.subplots import make_subplots

pio.renderers.default = "notebook"  # set the default plotly renderer to "notebook" (necessary for quarto to render the plots)

Data loading

Code
# AI4I 2020 predictive maintenance dataset, served as a zip archive by the UCI repository.
# NOTE(review): pandas is expected to infer the zip compression from the URL suffix — confirm if the download ever fails.
AI4I_DATASET_URL = "https://archive.ics.uci.edu/static/public/601/ai4i+2020+predictive+maintenance+dataset.zip"

df = pd.read_csv(AI4I_DATASET_URL)

# Show the first rows as a quick sanity check of the loaded columns.
df.head()
UDI Product ID Type Air temperature [K] Process temperature [K] Rotational speed [rpm] Torque [Nm] Tool wear [min] Machine failure TWF HDF PWF OSF RNF
0 1 M14860 M 298.1 308.6 1551 42.8 0 0 0 0 0 0 0
1 2 L47181 L 298.2 308.7 1408 46.3 3 0 0 0 0 0 0
2 3 L47182 L 298.1 308.5 1498 49.4 5 0 0 0 0 0 0
3 4 L47183 L 298.2 308.6 1433 39.5 7 0 0 0 0 0 0
4 5 L47184 L 298.2 308.7 1408 40.0 9 0 0 0 0 0 0

Extract the process temperature and add gaps of random lengths at random positions.

# Take the first 1000 rows of the process temperature under a code-friendly column name.
# The explicit .copy() makes dftemp an independent frame, so the in-place modifications
# below do not operate on a slice of another frame (avoids SettingWithCopyWarning).
dftemp = (
    df[["UDI", "Process temperature [K]"]]
    .rename(columns={"Process temperature [K]": "process_temperature_K"})
    .iloc[:1000]
    .copy()
)

# Gap configuration
num_gaps = 50  # Number of gaps to introduce
gap_min = 3     # Minimum gap length (in samples)
gap_max = 20    # Maximum gap length (in samples)

n = len(dftemp)
rng = np.random.default_rng(seed=42)

# Randomly select distinct start positions and gap lengths.
# Limiting the starts to n - gap_max guarantees that every gap ends inside the series;
# rng.integers excludes the upper bound, hence gap_max + 1.
starts = rng.choice(n - gap_max, size=num_gaps, replace=False)
lengths = rng.integers(gap_min, gap_max + 1, size=num_gaps)

# Add a little bit of noise to the process temperature (mean 0, std 0.1).
# The noise is drawn from the same seeded generator (after the gap positions) so that
# the whole synthetic dataset is reproducible from run to run.
dftemp["process_temperature_K"] += rng.normal(loc=0, scale=0.1, size=n)
dftemp["process_temperature_K_gaps"] = dftemp["process_temperature_K"]

# Mark the gap positions with a positional, half-open slice so that each gap blanks
# exactly `length` samples (a label-based .loc slice would include its end point and
# remove one sample too many). Gaps may overlap or touch, so merged gaps can be longer
# than gap_max.
gap_mask = np.zeros(n, dtype=bool)
for start, length in zip(starts, lengths):
    gap_mask[start:start + length] = True

dftemp.loc[gap_mask, "process_temperature_K_gaps"] = np.nan

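Helper function plot_temperature_data, which visualizes the original temperature data with and without gaps. If an imputed column int_column is provided, it is plotted as well, alongside a plot of the difference between the original and the imputed values. The second helper, plot_smoothed_temperature_data, compares the original series with a smoothed version of it and plots their difference.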

def plot_temperature_data(dftemp, orig_data_column="process_temperature_K", gap_column="process_temperature_K_gaps", int_column=None):
    """Plot the original temperature series together with its gapped version.

    Without ``int_column`` a single chart with the original and the gapped
    series is shown. With ``int_column`` a two-row figure is shown instead:
    the top row additionally contains the imputed values, restricted to the
    positions where ``gap_column`` is NaN, and the bottom row shows
    ``original - imputed`` at those same positions.

    Args:
        dftemp: DataFrame containing a ``"UDI"`` column (used as x axis) and
            the columns named by the remaining arguments.
        orig_data_column: Name of the column holding the complete series.
        gap_column: Name of the column holding the series with gaps (NaN).
        int_column: Optional name of the column holding the imputed series.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    x_values = dftemp["UDI"]

    # These two traces are identical in both layouts, so build them once.
    base_traces = [
        go.Scatter(
            x=x_values,
            y=dftemp[orig_data_column],
            mode='lines',
            name='Original',
            line=dict(color='lightgrey')
        ),
        go.Scatter(
            x=x_values,
            y=dftemp[gap_column],
            mode='lines',
            name='With Gaps',
        ),
    ]

    if int_column is None:
        fig = go.Figure(data=base_traces)
        fig.update_layout(
            title="Process Temperature With and Without Gaps",
            xaxis_title="UDI",
            yaxis_title="Process Temperature [K]"
        )
        fig.show()
        return

    # Keep the imputed values only inside the gaps; everywhere else they become
    # NaN so that neither the imputed line nor the difference is drawn there.
    imputed_in_gaps = dftemp[int_column].where(dftemp[gap_column].isna())
    diff = dftemp[orig_data_column] - imputed_in_gaps

    # Two stacked subplots: temperature on top, difference below.
    fig = make_subplots(
        rows=2, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.1,
        subplot_titles=("Process Temperature With and Without Gaps", "Original - Imputed Difference")
    )

    # Top plot: original, gaps, imputed
    for trace in base_traces:
        fig.add_trace(trace, row=1, col=1)
    fig.add_trace(
        go.Scatter(
            x=x_values,
            y=imputed_in_gaps,
            mode='lines',
            name='Imputed',
            line=dict(color='blue')
        ),
        row=1, col=1
    )

    # Bottom plot: difference
    fig.add_trace(
        go.Scatter(
            x=x_values,
            y=diff,
            mode='lines',
            name='Original - Imputed',
            line=dict(color='green')
        ),
        row=2, col=1
    )

    fig.update_layout(
        height=700,
        title_text="Process Temperature With and Without Gaps and Imputation Difference",
    )
    # With shared x axes only the bottom subplot shows tick labels, so the axis
    # title belongs there rather than between the two subplots.
    fig.update_xaxes(title_text="UDI", row=2, col=1)
    fig.update_yaxes(title_text="Process Temperature [K]", row=1, col=1)
    fig.update_yaxes(title_text="Difference", row=2, col=1)

    fig.show()


def plot_smoothed_temperature_data(dftemp, smoothed_column, orig_data_column="process_temperature_K"):
    """Plot the original series against a smoothed version and their difference.

    The figure has two stacked subplots sharing the x axis: the top one shows
    the original and the smoothed series, the bottom one shows
    ``original - smoothed``.

    Args:
        dftemp: DataFrame containing a ``"UDI"`` column (used as x axis) and
            the columns named by the remaining arguments.
        smoothed_column: Name of the column holding the smoothed series.
        orig_data_column: Name of the column holding the unsmoothed series.
            Defaults to ``"process_temperature_K"``, matching the default of
            ``plot_temperature_data``.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    x_values = dftemp["UDI"]
    original = dftemp[orig_data_column]
    smoothed = dftemp[smoothed_column]

    # Two stacked subplots: temperature on top, difference below.
    fig = make_subplots(
        rows=2, cols=1,
        shared_xaxes=True,
        vertical_spacing=0.1,
        subplot_titles=("Process Temperature: Original vs Smoothed", "Original - Smoothed Difference")
    )

    # Top plot: original and smoothed
    fig.add_trace(
        go.Scatter(
            x=x_values,
            y=original,
            mode='lines',
            name='Original',
            line=dict(color='lightgrey')
        ),
        row=1, col=1
    )
    fig.add_trace(
        go.Scatter(
            x=x_values,
            y=smoothed,
            mode='lines',
            name='Smoothed',
            line=dict(color='blue')
        ),
        row=1, col=1
    )

    # Bottom plot: difference
    fig.add_trace(
        go.Scatter(
            x=x_values,
            y=original - smoothed,
            mode='lines',
            name='Original - Smoothed',
            line=dict(color='green')
        ),
        row=2, col=1
    )

    fig.update_layout(
        height=700,
        title_text="Process Temperature: Original, Smoothed, and Difference",
    )
    # With shared x axes only the bottom subplot shows tick labels, so the axis
    # title belongs there rather than between the two subplots.
    fig.update_xaxes(title_text="UDI", row=2, col=1)
    fig.update_yaxes(title_text="Process Temperature [K]", row=1, col=1)
    fig.update_yaxes(title_text="Difference", row=2, col=1)

    fig.show()

11.1 Visualization of the temperature data with and without gaps

# Show the original series (light grey) together with the gapped series; no imputed column yet.
plot_temperature_data(dftemp)

11.2 Forward fill imputation

Forward fill imputation (also known as last observation carried forward) is a simple and commonly used method for handling missing data in time series. It involves replacing missing values with the most recent non-missing value prior to the gap.

Advantages:

  • Simple to implement and computationally efficient.
  • Preserves the last known state, which can be useful in certain contexts.
  • Works well for short gaps where the last observation is a reasonable estimate for the missing values.

Disadvantages:

  • Can introduce bias if the last observation is not representative of the missing values.
  • May not capture trends or patterns in the data, especially for long gaps.
  • Can lead to unrealistic flat segments in the time series.
  • Does not account for seasonality or cyclic patterns in the data.

# Pandas provides forward fill imputation out of the box:
# every NaN is replaced by the last valid value that precedes it.
gapped_series = dftemp["process_temperature_K_gaps"]
dftemp["temp_ff"] = gapped_series.ffill()

plot_temperature_data(dftemp, int_column="temp_ff")

11.3 Backward fill imputation

Backward fill imputation (also known as next observation carried backward) is another simple method for handling missing data in time series. It involves replacing missing values with the next non-missing value that follows the gap.

Advantages:

  • Simple to implement and computationally efficient.
  • Preserves the next known state, which can be useful in certain contexts.
  • Works well for short gaps where the next observation is a reasonable estimate for the missing values.

Disadvantages:

  • Can introduce bias if the next observation is not representative of the missing values.
  • May not capture trends or patterns in the data, especially for long gaps.
  • Can lead to unrealistic flat segments in the time series.
  • Does not account for seasonality or cyclic patterns in the data.

# Pandas provides backward fill imputation out of the box:
# every NaN is replaced by the next valid value that follows it.
gapped_series = dftemp["process_temperature_K_gaps"]
dftemp["temp_bf"] = gapped_series.bfill()

plot_temperature_data(dftemp, int_column="temp_bf")

11.4 Linear Interpolation

Linear interpolation is a method used to estimate missing values in a time series by connecting two known data points with a straight line and using that line to fill in the gaps.

Advantages:

  • Can provide more accurate estimates than forward or backward fill, especially for short gaps.
  • Preserves trends and patterns in the data better than simple imputation methods.

Disadvantages:

  • Assumes a linear relationship between data points, which may not always be valid.
  • Can introduce bias if the underlying data has non-linear trends.
  • May not perform well for long gaps or highly volatile data.

# Fill each gap with a straight line between the valid values on either side.
gapped_series = dftemp["process_temperature_K_gaps"]
dftemp["temp_linear"] = gapped_series.interpolate(method="linear")

plot_temperature_data(dftemp, int_column="temp_linear")

11.5 Polynomial interpolation

Polynomial interpolation is a method used to estimate missing values in a time series by fitting a polynomial function of a given order to the known data points and using that function to fill in the gaps.

Advantages:

  • Can provide more accurate estimates than linear interpolation, especially for non-linear trends.

Disadvantages:

  • More computationally expensive than other methods.
  • Tends to overshoot or oscillate between known data points, leading to unrealistic estimates.
  • May not perform well for short gaps or sparse data.

# Interpolate the gapped series with polynomials of order 2 and 5 and plot each
# result, so the effect of the order on overshooting can be compared.
for poly_order in (2, 5):
    poly_column = f"temp_poly{poly_order}"
    dftemp[poly_column] = dftemp["process_temperature_K_gaps"].interpolate(method="polynomial", order=poly_order)

    plot_temperature_data(dftemp, int_column=poly_column)

11.6 Spline interpolation

Spline interpolation is a method used to estimate missing values in a time series by fitting piecewise polynomial functions (splines) to the known data points and using those functions to fill in the gaps.

The specific type PCHIP (Piecewise Cubic Hermite Interpolating Polynomial) in particular preserves the monotonicity of the data and avoids overshooting, making it suitable for many real-world applications.

Advantages:

  • Can provide smooth and accurate estimates, especially for non-linear trends.
  • Can capture complex patterns in the data better than simpler methods.
  • Less prone to overfitting compared to high-degree polynomial interpolation.

Disadvantages:

  • More computationally intensive than simpler methods.
  • Requires careful selection of spline parameters (e.g., degree, knots).
  • May not perform well for very short gaps or sparse data.

# PCHIP: piecewise cubic Hermite interpolation of the gapped series.
gapped_series = dftemp["process_temperature_K_gaps"]
dftemp["temp_spline"] = gapped_series.interpolate(method="pchip")

plot_temperature_data(dftemp, int_column="temp_spline")

12 Data smoothing

Especially for noisy data and higher-order methods like splines or polynomials, interpolation methods can produce unrealistic results, such as overshooting or oscillations between known data points. Smoothing the time series prior to interpolation can help to mitigate this issue by reducing noise and capturing the underlying trend of the data.

Smoothing techniques, such as moving averages or rolling medians, can be applied to the time series before performing interpolation. Rolling median, by design, is more robust to outliers and preserves edges better than rolling mean. Rolling mean tends to smooth out the data more uniformly but is more sensitive to outliers.

When applying smoothing, it is important to choose an appropriate window size. The window size determines the number of consecutive data points used to calculate the smoothed value at each point in the time series. The larger the window size, the smoother the resulting time series will be, but it may also lead to a loss of detail and important features in the data.

12.1 Rolling mean smoothing

# Centered rolling mean of the complete (gap-free) series for increasing window sizes.
# No min_periods is given, so positions whose window is incomplete at the ends of the series stay NaN.
for window_size in (3, 5, 10):
    mean_column = f"temp_mean_rolling{window_size}"
    rolling_windows = dftemp["process_temperature_K"].rolling(window=window_size, center=True)
    dftemp[mean_column] = rolling_windows.mean()
    plot_smoothed_temperature_data(dftemp, smoothed_column=mean_column)

12.2 Rolling median smoothing

# Centered rolling median of the complete (gap-free) series for increasing window sizes.
# No min_periods is given, so positions whose window is incomplete at the ends of the series stay NaN.
for window_size in (3, 5, 10):
    median_column = f"temp_median_rolling{window_size}"
    rolling_windows = dftemp["process_temperature_K"].rolling(window=window_size, center=True)
    dftemp[median_column] = rolling_windows.median()
    plot_smoothed_temperature_data(dftemp, smoothed_column=median_column)

12.3 Applying smoothing prior to interpolation

Applying smoothing prior to interpolation can help to reduce noise and improve the accuracy of the imputed values, especially for noisy data.

Note that smoothing is applied on the gapped data, not on the original data. The argument min_periods=2 in the rolling mean function means that a window only needs two non-NaN values to produce a mean. Without it, min_periods defaults to the window size, so every window that touches a gap would yield NaN and each gap would grow by roughly half a window on both sides.

# Positions that are missing in the gapped series; these are the values to impute.
gap_mask = dftemp["process_temperature_K_gaps"].isna()

# Smooth the gapped series. min_periods=2 lets windows that overlap a gap still
# produce a value, so valid samples next to a gap are not lost.
dftemp["process_temperature_K_gaps_smoothed"] = (
    dftemp["process_temperature_K_gaps"].rolling(window=10, center=True, min_periods=2).mean()
)

# With min_periods=2 the rolling mean also produces values *inside* the gaps
# (one-sided means near the gap edges). Put NaN back at the original gap
# positions so that the gaps are filled entirely by PCHIP in the next step and
# not partly by those edge means. The same masked column is used for plotting.
dftemp["process_temperature_K_gaps_smoothed_plot"] = dftemp["process_temperature_K_gaps_smoothed"].mask(gap_mask)

# Interpolate the smoothed series across the original gaps.
dftemp["temp_smoothed_pchip"] = dftemp["process_temperature_K_gaps_smoothed_plot"].interpolate("pchip")

plot_temperature_data(dftemp, gap_column="process_temperature_K_gaps_smoothed_plot", int_column="temp_smoothed_pchip")